// bdljsn_stringutil.h                                                -*-C++-*-
#ifndef INCLUDED_BDLJSN_STRINGUTIL
#define INCLUDED_BDLJSN_STRINGUTIL

#include <bsls_ident.h>
BSLS_IDENT("$Id: $")

//@PURPOSE: Provide utility functions for JSON strings.
//
//@CLASSES:
//  bdljsn::StringUtil: namespace for utility functions on JSON strings
//
//@DESCRIPTION: This component defines a utility `struct`,
// `bdljsn::StringUtil`, that is a namespace for functions that convert
// arbitrary UTF-8 codepoint sequences to JSON strings and vice versa.  The
// rules for these conversions are outlined below in {JSON Strings} and
// detailed in: https://www.rfc-editor.org/rfc/rfc8259#section-7 (RFC8259)
//
// This utility provides two key functions:
//
// * `writeString`: Given an arbitrary UTF-8 codepoint sequence, generate a
//   JSON string representing the same codepoints.
// * `readString`: Given a JSON string (e.g., the output of `writeString`),
//   generate the equivalent sequence of UTF-8 code points.
//
// When using these functions, a UTF-8 codepoint sequence is always preserved
// on the round trip to JSON string and back; however, since there are
// equivalent allowed representations of a JSON string, the converse is not
// guaranteed.
//
///JSON Strings
///------------
// JSON strings consist of UTF-8 codepoints surrounded by double quotes (i.e.,
// '\"').  Within those double quotes certain characters *must* be escaped
// (i.e., replaced with some alternative, multi-byte representation).  Those
// characters are:
//
// * quotation marks
// * backslashes (a.k.a., a "reverse solidus")
// * the "control characters" in the range `U+0000` to `U+001F` (inclusive).
//
// Each of the above characters can be escaped by replacing it with the six
// byte sequence consisting of:
//
//   - a backslash,
//   - a lower-case `u`, and
//   - the Unicode value expressed as four hexadecimal digits.
//
// For example, the character that rings the console bell is represented as
// '\u0007'.  Note that the hexadecimal digits can use upper or lower case
// letters but the lead `u` character must be lower case.  See {Strictness}.
//
// Seven of the characters that must be escaped (and `/`, which *may* be
// escaped) can be alternatively represented by special, 2-byte sequences:
// ```
// +---------+-----------------+---------------+---------------+
// | Unicode | Description     | 6-byte escape | 2-byte escape |
// +---------+-----------------+---------------+---------------+
// | U+0022  | quotation mark  | \u0022        |  \"           |
// | U+005C  | backslash       | \u005c        |  \\           |
// | U+002F  | slash           | \u002f        |  \/           |
// | U+0008  | backspace       | \u0008        |  \b           |
// | U+000C  | form feed       | \u000C        |  \f           |
// | U+000A  | line feed       | \u000A        |  \n           |
// | U+000D  | carriage return | \u000D        |  \r           |
// | U+0009  | tab             | \u0009        |  \t           |
// +---------+-----------------+---------------+---------------+
// ```
// Note that the above set is similar to but not identical to the set of two
// byte `char` literals supported by C++.  For example, '\0' (null) and '\a'
// (bell) are not included above.
//
///Guarantees: Arbitrary UTF-8 to JSON String
/// - - - - - - - - - - - - - - - - - - - - -
// * No UTF-8 characters in the *Basic* *Multilingual* *Plane* are escaped
//   unless they are in the set that *must* be escaped (or are `/` with the
//   appropriate flags).
// * When a character must be escaped, the 6-byte (hexadecimal) representation
//   is used only if no 2-byte escape exists.
// * When a 6-byte (hexadecimal) representation is used, hexadecimal letters
//   are in upper case.
// * All UTF-8 characters outside of the *Basic* *Multilingual* *Plane*
//   are represented by two, adjacent 6-byte hexadecimal escape
//   sequences.  For details, see:
//   https://en.wikipedia.org/wiki/UTF-16#U+D800_to_U+DFFF
//
///Strictness
///----------
// By default, the `bdljsn::StringUtil` read and write methods strictly follow
// the RFC8259 standard.  Variances from those rules are expressed using
// `bdljsn::StringUtil::Flags`, an `enum` of flag values that can be set in the
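// optional `flags` parameter of the read and write methods.  Multiple flags
// can be combined in `flags` using bitwise OR.  Currently, two variance flags
// are defined: `e_ACCEPT_CAPITAL_UNICODE_ESCAPE` (see {Example Variance}) and
// `e_NO_ESCAPING_FORWARD_SLASH`.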
//
///Example Variance
/// - - - - - - - -
// RFC8259 specifies that the 6-byte Unicode escape sequence start with a
// backslash, `\`, and lower-case `u`.  However, if the
// `bdljsn::StringUtil::e_ACCEPT_CAPITAL_UNICODE_ESCAPE` flag is set, an
// upper-case `U` is accepted as well.  Thus, both '\u0007' and '\U0007' would
// be interpreted as the BELL character.
//
///Usage
///-----
// This section illustrates intended use of this component.
//
///Example 1: Encoding and Decoding a JSON String
/// - - - - - - - - - - - - - - - - - - - - - - -
// First, we initialize a string with a valid sequence of UTF-8 codepoints.
// ```
// bsl::string initial("Does the name \"Ivan Pavlov\" ring a bell\a?\n");
// assert(bdlde::Utf8Util::isValid(initial));
// ```
// Notice that, as required by C++ syntax, several characters are represented
// by their two-character escape sequence: double quote (twice), bell, and
// newline.
//
// Then, we examine the string as output:
// ```
// bsl::cout << initial << bsl::endl;
// ```
// and observe:
// ```
// Does the name "Ivan Pavlov" ring a bell?
//
// ```
// Notice that the backslash characters (having served their purpose of giving
// special meaning to the subsequent character) are not shown.  The BELL and
// NEWLINE characters are output but are not visible.
//
// Now, we generate the JSON string equivalent of the `initial` string.
// ```
// bsl::ostringstream oss;
//
// int rcEncode = bdljsn::StringUtil::writeString(oss, initial);
// assert(0 == rcEncode);
//
// bsl::string  jsonCompatibleString = oss.str();
// bsl::cout << jsonCompatibleString << bsl::endl;
// ```
// and observe how the `initial` string is represented for JSON:
// ```
// "Does the name \"Ivan Pavlov\" ring a bell\u0007?\n"
// ```
// Notice that:
// * The entire string is delimited by double quotes.
// * The interior double quotes and new line are represented by two character
//   escape sequences (as they were in the C++ string literal).
// * Since JSON does not have a two character escape sequence for the BELL
//   character, '\u0007', the 6-byte Unicode representation is used.
//
// Finally, we convert the `jsonCompatibleString` back to its original content:
// ```
// bsl::string fromJsonString;
// const int   rcDecode = bdljsn::StringUtil::readString(&fromJsonString,
//                                                       jsonCompatibleString);
// assert(0       == rcDecode);
// assert(initial == fromJsonString);
//
// bsl::cout << fromJsonString << bsl::endl;
// ```
// and observe (again):
// ```
// Does the name "Ivan Pavlov" ring a bell?
//
// ```

#include <bdlscm_version.h>

#include <bsls_assert.h>

#include <bsl_ostream.h>
#include <bsl_string.h>
#include <bsl_string_view.h>

namespace BloombergLP {
namespace bdljsn {

                             // =================
                             // struct StringUtil
                             // =================

/// This `struct` provides a namespace for utility functions that convert
/// arbitrary UTF-8 sequences into JSON strings and vice versa.  See
/// [](#JSON Strings) for details of these transformations.
struct StringUtil {

  public:
    // TYPES
    /// Flag values that may be combined (bitwise OR) and passed as the
    /// `flags` argument of the methods of this utility to request variances
    /// from the default, strict behavior (see [](#Strictness)).
    enum Flags {
        /// Request no variance; this is the default value of every `flags`
        /// parameter declared below.
        e_NONE                          = 0,
        /// When decoding, accept an upper-case `U` (in addition to the
        /// lower-case `u`) as the lead character of a 6-byte Unicode escape
        /// sequence (see [](#Example Variance)).
        e_ACCEPT_CAPITAL_UNICODE_ESCAPE = 1 << 0,
        /// NOTE(review): judging by its name, this flag presumably requests
        /// that `/` not be escaped when writing a JSON string; the code that
        /// consults it is not visible in this header -- confirm against the
        /// implementation file.
        e_NO_ESCAPING_FORWARD_SLASH     = 1 << 1
    };

  private:
    // PRIVATE CLASS METHODS

    /// Load to the specified `value` the UTF-8 codepoint sequence equivalent
    /// to the specified (JSON) `string` (see [](#JSON Strings)).  Return 0 on
    /// success and a non-zero value otherwise.  Use the specified `flags` to
    /// request variances from certain rules of JSON decoding (see
    /// [](#Strictness)).  `string` is expected to include its enclosing
    /// double quotes, which are stripped before the remaining contents are
    /// decoded by `readUnquotedString`.  This template is the common
    /// implementation of the public `readString` overloads, where the
    /// (template parameter) `STRING` is the type of the string to load.
    template <class STRING>
    static int readStringImp(STRING                  *value,
                             const bsl::string_view&  string,
                             int                      flags);

  public:
    // CLASS METHODS

    /// Load to the specified `value` the UTF-8 codepoint sequence equivalent
    /// to the specified (JSON) `string` (see [](#JSON Strings)).  Return 0 on
    /// success and a non-zero value otherwise.  Optionally specify `flags` to
    /// request variances from certain rules of JSON decoding (see
    /// [](#Strictness)).
    static int readString(bsl::string             *value,
                          const bsl::string_view&  string,
                          int                      flags = e_NONE);
    static int readString(std::string             *value,
                          const bsl::string_view&  string,
                          int                      flags = e_NONE);
#ifdef BSLS_LIBRARYFEATURES_HAS_CPP17_PMR_STRING
    static int readString(std::pmr::string        *value,
                          const bsl::string_view&  string,
                          int                      flags = e_NONE);
#endif

    /// Load to the specified `value` the UTF-8 codepoint sequence equivalent
    /// to the specified `string`, that is JSON-compliant absent the leading
    /// and trailing double quote characters (see [](#JSON Strings)).  Return 0
    /// on success and a non-zero value otherwise.  Optionally specify `flags`
    /// to request variances from certain rules of JSON decoding (see
    /// [](#Strictness)).
    static int readUnquotedString(bsl::string             *value,
                                  const bsl::string_view&  string,
                                  int                      flags = e_NONE);
    static int readUnquotedString(std::string             *value,
                                  const bsl::string_view&  string,
                                  int                      flags = e_NONE);
#ifdef BSLS_LIBRARYFEATURES_HAS_CPP17_PMR_STRING
    static int readUnquotedString(std::pmr::string        *value,
                                  const bsl::string_view&  string,
                                  int                      flags = e_NONE);
#endif

    /// Write to the specified `stream` a JSON-compliant string that is
    /// equivalent to the specified `string`, an arbitrary UTF-8 codepoint
    /// sequence.  Return 0 on success and a non-zero value otherwise.  The
    /// operation fails if `string` is not a sequence of UTF-8 codepoints or if
    /// there is an error writing to `stream`.  Optionally specify `flags`, a
    /// bitwise combination of `Flags` values, to request variances from the
    /// default encoding rules (see [](#Strictness)); `e_NONE` is used if
    /// `flags` is not supplied.  See
    /// [](#Guarantees: Arbitrary UTF-8 to JSON String) for further details.
    static int writeString(bsl::ostream&           stream,
                           const bsl::string_view& string,
                           int                     flags = e_NONE);
};

// ============================================================================
//                             INLINE DEFINITIONS
// ============================================================================

                             // -----------------
                             // struct StringUtil
                             // -----------------

// PRIVATE CLASS METHODS
/// Load to the specified `value` the UTF-8 codepoint sequence equivalent to
/// the specified (JSON) `string`, which must begin *and* end with a double
/// quote character.  Return 0 on success and a non-zero value otherwise.  Use
/// the specified `flags` to request variances from the default decoding
/// rules.  The delimiting quotes are removed here and the decoding of the
/// enclosed contents is delegated to `readUnquotedString`.
template <class STRING>
inline
int StringUtil::readStringImp(STRING                  *value,
                              const bsl::string_view&  string,
                              int                      flags)
{
    BSLS_ASSERT(value);

    // The shortest valid JSON string is the empty string, `""`, so anything
    // shorter than two characters cannot hold both delimiters.

    if (2 > string.size()) {
        return -1;                                                    // RETURN
    }

    // *Both* delimiters are required: reject the input if *either* the first
    // or the last character is not a double quote.  (Testing with `&&` would
    // accept input such as `"abc` or `abc"` and then silently discard a
    // character that is not a quote.)

    if ('"' != string[0] || '"' != string[string.size() - 1]) {
        return -1;                                                    // RETURN
    }

    // Strip the leading and trailing quotes and decode what lies between.

    const bsl::string_view contents = string.substr(1, string.size() - 2);
    return readUnquotedString(value, contents, flags);
}

// CLASS METHODS

inline
int StringUtil::readString(bsl::string             *value,
                           const bsl::string_view&  string,
                           int                      flags)
{
    // Forward to the common implementation, instantiated for `bsl::string`.

    const int rc = readStringImp<bsl::string>(value, string, flags);
    return rc;
}

inline
int StringUtil::readString(std::string             *value,
                           const bsl::string_view&  string,
                           int                      flags)
{
    // Forward to the common implementation, instantiated for `std::string`.

    const int rc = readStringImp<std::string>(value, string, flags);
    return rc;
}

#ifdef BSLS_LIBRARYFEATURES_HAS_CPP17_PMR_STRING
inline
int StringUtil::readString(std::pmr::string        *value,
                           const bsl::string_view&  string,
                           int                      flags)
{
    // Forward to the common implementation, instantiated for
    // `std::pmr::string`.

    const int rc = readStringImp<std::pmr::string>(value, string, flags);
    return rc;
}
#endif

}  // close package namespace
}  // close enterprise namespace

#endif

// ----------------------------------------------------------------------------
// Copyright 2022 Bloomberg Finance L.P.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// ----------------------------- END-OF-FILE ----------------------------------
